So far, I have trained two machine learning models to explain the impact of access to transit on the proportion of residents using public transportation, down to the DA level. I also tuned their hyperparameters using cross validation. They have both achieved cross-validation $R^2$ of about 0.67.
However, there is still a big distance between our models and actionable real-world solutions. At the end of the day, we want to know where the transit authority of the Greater Vancouver Area should put resources to develop new infrastructure.
import altair as alt
import esda
import geopandas
import json
import joblib
import libpysal as lps
import os
import numpy as np
import pandas as pd
import shap
# sklearn modules
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Lasso
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (
OneHotEncoder,
OrdinalEncoder,
StandardScaler,
MinMaxScaler,
)
# Mapping modules
import contextily as ctx
import matplotlib.pyplot as plt
# Bounding box used to crop all Greater Vancouver Area maps to the same
# extent. Values appear to be Web Mercator (EPSG:3857, contextily's
# default) coordinates -- confirm against the GeoDataFrame CRS.
GVA_map_xlim_lower = -13746072.435927173
GVA_map_xlim_higher = -13630000
GVA_map_ylim_lower = 6270302.809935683
GVA_map_ylim_higher = 6345000
# Snapshot tag selecting which versioned data/models/figures to read and write.
data_version = "20200606"
## Data
# Training features as a GeoDataFrame (one row per dissemination area, with
# geometry) for the selected data version.
X_train = geopandas.read_file(
    os.path.join("Data_Tables", data_version, "X_train.json")
)
# Human-readable descriptions of the raw census columns, indexed by column
# position in X_train (used later to explain important features).
with open(os.path.join("Data_Tables", data_version, "X_header.json"), "r") as X_header_outfile:
    X_header = json.load(X_header_outfile)
# Training target: proportion of residents commuting by public transit,
# stored as a JSON-serialized pandas Series.
with open(os.path.join("Data_Tables", data_version, "y_train.json"), "r") as y_train_outfile:
    y_train = json.load(y_train_outfile)
y_train = pd.read_json(y_train, typ="series")
# Read data and models
## Trained models
# Tuned hyperparameter-search objects persisted during training (presumably
# RandomizedSearchCV -- confirm); .best_estimator_ is used below to predict.
random_search_rf = joblib.load(os.path.join(
    "Models",
    data_version,
    "random_search_rf.joblib",
))
random_search_LASSO = joblib.load(os.path.join(
    "Models",
    data_version,
    "random_search_LASSO.joblib",
))
## Pre-processor
# NOTE(review): the last unpacked name shadows the imported
# sklearn.compose.ColumnTransformer class for the rest of the file.
# Harmless if the class itself is never used again, but consider renaming.
preprocessor, categorical_transformer, numeric_transformer, proportion_transformer, ColumnTransformer = joblib.load(
    os.path.join("Models", data_version, "preprocessor.joblib")
)
## Variable types
# Feature-name lists saved at training time, split by how each group is
# preprocessed; geometry_feature is the geometry column's name.
with open(
    os.path.join("Models", data_version, "features.json"),
    "r",
) as feature_outfile:
    categorical_features, numeric_features, proportion_features, geometry_feature = json.load(feature_outfile)
## LASSO model
# Full modeling pipeline: shared preprocessing followed by a LASSO regressor.
pipe_LASSO = Pipeline([("preprocessor", preprocessor), ("LASSO_reg", Lasso())])
## Random Forest model
# Same preprocessing, with a random-forest regressor as the final step.
pipe_rf = Pipeline([("preprocessor", preprocessor), ("rf_reg", RandomForestRegressor())])
## Get input feature names
def get_column_names_from_ColumnTransformer(column_transformer):
    """Return the post-transformation feature names of a fitted ColumnTransformer.

    Walks every fitted transformer (skipping the trailing 'remainder' entry)
    and derives output column names:
    - OneHotEncoder: expanded dummy-column names;
    - SimpleImputer with add_indicator=True: the original names plus one
      "<col>_missing_flag" per column that had missing values;
    - anything else exposing get_feature_names(): its reported names;
    - otherwise: the raw input column names unchanged.

    Parameters
    ----------
    column_transformer : sklearn.compose.ColumnTransformer
        Must already be fitted (transformers_ is read).

    Returns
    -------
    list of str
        Output feature names, in the order the transformer emits them.
    """
    col_name = []
    # transformers_ entries are (name, fitted_transformer, input_columns);
    # the last entry is the implicit 'remainder' and carries no usable names.
    for name, fitted, columns in column_transformer.transformers_[:-1]:
        print("\n\ntransformer: ", name)
        raw_col_name = list(columns)
        if isinstance(fitted, Pipeline):
            # For a pipeline, only the final step determines the output names.
            fitted = fitted.steps[-1][1]
        try:
            if isinstance(fitted, OneHotEncoder):
                names = list(fitted.get_feature_names(raw_col_name))
            elif isinstance(fitted, SimpleImputer) and fitted.add_indicator:
                # indicator_.features_ holds the indices of the columns that
                # received a missing-value indicator column.
                missing_indicators = [
                    raw_col_name[idx] + "_missing_flag"
                    for idx in fitted.indicator_.features_
                ]
                names = raw_col_name + missing_indicators
            else:
                names = list(fitted.get_feature_names())
        except AttributeError:
            # Transformer exposes no naming API; fall back to input names.
            names = raw_col_name
        col_name.extend(names)
    return col_name
# Fit the preprocessor on the training data so fitted attributes
# (transformers_, one-hot categories, ...) are available for name extraction.
preprocessor.fit(X_train)
feature_names = get_column_names_from_ColumnTransformer(preprocessor)
# Names per feature group, expressed in post-transformation space.
categorical_feature_names = list(preprocessor.named_transformers_["cat"]["onehot"].get_feature_names(categorical_features))
numeric_feature_names = numeric_features
proportion_feature_names = proportion_features
How are the errors distributed across DAs? We can see below that the errors of both models are close to normally distributed.
# Training features in model (post-preprocessing) space, with readable
# column names and the original DA index preserved.
preprocessed_data = pd.DataFrame(
    data=preprocessor.transform(X_train),
    columns=feature_names,
    index=X_train.index,
)
### Get predictions
# Attach per-DA predictions and signed errors (prediction - truth) for both
# tuned models. Note these are predictions on the training data, so the
# errors measure in-sample fit, not generalization.
preprocessed_EA = preprocessed_data.copy()
preprocessed_EA["pred_LASSO"] = random_search_LASSO.best_estimator_["LASSO_reg"].predict(preprocessed_data)
preprocessed_EA["error_LASSO"] = preprocessed_EA["pred_LASSO"] - y_train
preprocessed_EA["pred_rf"] = random_search_rf.best_estimator_["rf_reg"].predict(preprocessed_data)
preprocessed_EA["error_rf"] = preprocessed_EA["pred_rf"] - y_train
### Histograms
# Binned histograms of per-DA signed errors for each model, saved as PNGs.
LASSO_error_chart = alt.Chart(preprocessed_EA).mark_bar().encode(
    alt.X("error_LASSO:Q", bin=alt.Bin(maxbins=30), title="LASSO model error (binned)"),
    y='count()',
).properties(title="Distribution of LASSO model errors")
LASSO_error_chart.save(
    os.path.join("Figures", data_version, "LASSO_error_chart.png")
)
rf_error_chart = alt.Chart(preprocessed_EA).mark_bar().encode(
    alt.X("error_rf:Q", bin=alt.Bin(maxbins=30), title="Random Forest model error (binned)"),
    y='count()',
).properties(title="Distribution of Random Forest model errors")
# BUG FIX: this previously overwrote LASSO_error_chart.png, so the random
# forest histogram was never saved; write it under its own filename.
rf_error_chart.save(
    os.path.join("Figures", data_version, "rf_error_chart.png")
)
# Join the errors back onto the geographic frame so they can be mapped.
X_train_EA = X_train.copy()
X_train_EA["error_LASSO"] = preprocessed_EA["error_LASSO"]
X_train_EA["error_rf"] = preprocessed_EA["error_rf"]
# Choropleth of LASSO errors on a diverging "bwr" colormap (blue =
# under-prediction, red = over-prediction), cropped to the GVA bounding box
# and overlaid on a web basemap.
LASSO_error_ax = X_train_EA.plot(
    figsize=(20, 20),
    alpha=0.5,
    column="error_LASSO",
    cmap="bwr",
    legend=True,
    legend_kwds={'shrink': 0.5},
)
LASSO_error_ax.set_xlim(GVA_map_xlim_lower, GVA_map_xlim_higher)
LASSO_error_ax.set_ylim(GVA_map_ylim_lower, GVA_map_ylim_higher)
ctx.add_basemap(LASSO_error_ax, zoom=12)
plt.title("LASSO model error", fontsize=30)
plt.savefig(
    os.path.join(
        "Maps",
        data_version,
        "LASSO_error.png",
    )
)
# Random-forest error choropleth, same styling and extent as the LASSO map.
# BUG FIX: this previously plotted column="error_LASSO", so the saved
# "rf_error.png" actually showed LASSO errors; plot the RF errors instead.
rf_error_ax = X_train_EA.plot(
    figsize=(20, 20),
    alpha=0.5,
    column="error_rf",
    cmap="bwr",
    legend=True,
    legend_kwds={'shrink': 0.5},
)
rf_error_ax.set_xlim(GVA_map_xlim_lower, GVA_map_xlim_higher)
rf_error_ax.set_ylim(GVA_map_ylim_lower, GVA_map_ylim_higher)
ctx.add_basemap(rf_error_ax, zoom=12)
plt.title("Random Forest model error", fontsize=30)
plt.savefig(
    os.path.join(
        "Maps",
        data_version,
        "rf_error.png",
    )
)
However, a formal Moran’s I test shows that both of our models have spatial autocorrelation problems. The autocorrelation coefficients are not themselves high (0.05 and 0.18 for the LASSO and Random Forest models, respectively). However, the pseudo p-values are at 0.001.
##### Create queen contiguity
# Spatial weights: DAs sharing any boundary point are neighbors (queen
# contiguity), row-standardized so each row of weights sums to 1.
wq = lps.weights.Queen.from_dataframe(X_train_EA)
wq.transform = 'r'
LASSO_errors = preprocessed_EA["error_LASSO"]
rf_errors = preprocessed_EA["error_rf"]
##### Get Moran’s I
# Global Moran's I tests whether model errors are spatially autocorrelated;
# p_sim is a permutation-based pseudo p-value. The bare expressions below
# display their values in the notebook.
###### LASSO model
mi_LASSO = esda.moran.Moran(LASSO_errors, wq)
mi_LASSO.I
mi_LASSO.p_sim
###### Random Forest model
mi_rf = esda.moran.Moran(rf_errors, wq)
mi_rf.I
mi_rf.p_sim
Both LASSO and Random Forest models give easy access to measurements of global feature importance. For the LASSO model, I use the magnitude of the coefficients to roughly estimate each variable's importance. For the Random Forest model, I use the impurity-based measurement of each variable's importance.
In addition, the three types of input variables, namely categorical_features, numeric_features, and proportion_features are scaled differently in the preprocessing step. Therefore, I will review the important variables for all three categories separately.
### LASSO Model
# Fitted LASSO coefficients paired with the post-preprocessing feature names.
LASSO_coeffs = random_search_LASSO.best_estimator_["LASSO_reg"].coef_
LASSO_feature_coeffs = pd.DataFrame({"feature": feature_names, "coeffs": LASSO_coeffs})
### Random Forest Model
# Impurity-based importances; these are non-negative and carry no
# direction-of-effect information.
rf_immpurity_feat_imp = random_search_rf.best_estimator_["rf_reg"].feature_importances_
rf_immpurity_feat_imp_coeffs = pd.DataFrame(
    {"feature": feature_names, "impurity_importance": rf_immpurity_feat_imp}
)
The table below shows the five categorical features that most strongly predict the proportion of transit use among residents. They are ADA or CCS areas.
#### Categorical features
# Restrict LASSO coefficients to the one-hot categorical columns and show
# the five most positive ones (strongest predictors of HIGH transit use).
LASSO_categorical_feature_coeffs = LASSO_feature_coeffs.loc[LASSO_feature_coeffs["feature"].isin(categorical_feature_names), :]
LASSO_categorical_feature_coeffs.sort_values("coeffs", ascending = False).head(5)
# Map the DAs belonging to the top-ranked areas.
# NOTE(review): these area IDs are hard-coded from the table output for this
# data_version; re-check them if the data or model is retrained.
X_train_LASSO_top_categorical_mask = ((X_train.ADAUID == "59150117")
                                      | (X_train.CCSUID == "5915022")
                                      | (X_train.CCSUID == "5915025"))
X_train_LASSO_top_categorical = X_train.loc[X_train_LASSO_top_categorical_mask, :]
LASSO_top_categorical_ax = X_train_LASSO_top_categorical.plot(
    figsize=(20, 20),
    alpha=0.5,
    color="red"
)
ctx.add_basemap(LASSO_top_categorical_ax, zoom=12)
plt.title("Predicting High Transit Use: Categorical Variable (LASSO Model)", fontsize=30)
plt.savefig(
    os.path.join(
        "Maps",
        data_version,
        "LASSO_top_categorical.png",
    )
)
As shown in the map above, areas in the city of Vancouver tend to have high proportions of people using public transit.
# Five most negative categorical coefficients: areas that most strongly
# predict LOW transit use under the LASSO model.
LASSO_categorical_feature_coeffs.sort_values("coeffs").head(5)
# Map those bottom-ranked ADAs (IDs hard-coded from the table above).
X_train_LASSO_bottom_categorical_mask = ((X_train.ADAUID == "59150035")
                                         | (X_train.ADAUID == "59150040")
                                         | (X_train.ADAUID == "59150042"))
X_train_LASSO_bottom_categorical = X_train.loc[X_train_LASSO_bottom_categorical_mask, :]
LASSO_bottom_categorical_ax = X_train_LASSO_bottom_categorical.plot(
    figsize=(20, 20),
    alpha=0.5,
    color="red"
)
ctx.add_basemap(LASSO_bottom_categorical_ax, zoom=14)
plt.title("Predicting low Transit Use: Categorical Variable (LASSO Model)", fontsize=30)
plt.savefig(
    os.path.join(
        "Maps",
        data_version,
        "LASSO_bottom_categorical.png",
    )
)
Somewhat unexpectedly, the areas that most strongly predict low public transit use are also in the downtown area. They are distributed along Thurlow Street. It is important to note that they are also in the CCS which predicts high transit use. In other words, these areas may have lower transit use than their immediate neighbors, but not necessarily compared to other areas in the GVA.
The table below shows the five categorical features that most strongly impact the proportion of transit use among residents. They are CSD or CCS areas. It is worth noting that we cannot know the direction of impact from these impurity measures.
#### Categorical features
# Top-5 categorical features by random-forest impurity importance (magnitude
# only -- the direction of the effect is not identifiable from importances).
rf_categorical_feature_imps = rf_immpurity_feat_imp_coeffs.loc[rf_immpurity_feat_imp_coeffs["feature"].isin(categorical_feature_names), :]
rf_categorical_feature_imps.sort_values("impurity_importance", ascending = False).head(5)
# Map the corresponding areas (IDs hard-coded from the table above; note the
# first condition matches on CSDUID while the others use CCSUID -- confirm
# that is intentional).
X_train_rf_imp_categorical_mask = ((X_train.CSDUID == "5915022")
                                   | (X_train.CCSUID == "5915025")
                                   | (X_train.CCSUID == "5915001"))
X_train_rf_imp_categorical = X_train.loc[X_train_rf_imp_categorical_mask, :]
rf_imp_categorical_ax = X_train_rf_imp_categorical.plot(
    figsize=(20, 20),
    alpha=0.5,
    color="orange"
)
ctx.add_basemap(rf_imp_categorical_ax, zoom=12)
plt.title("Predicting Transit Use: Important Categorical Variables (Random Forest Model)", fontsize=30)
plt.savefig(
    os.path.join(
        "Maps",
        data_version,
        "rf_imp_categorical.png",
    )
)
If we can take a guess, however, areas in the city of Vancouver probably tend to have higher rates of transit use. By contrast, The east part of Langley city and Aldergrove probably have low transit use. We will know more details about each feature's impact later using SHAP.
#### LASSO Model
# Numeric-feature coefficients, annotated with the human-readable census
# description looked up from X_header by column position.
LASSO_numeric_feature_coeffs = LASSO_feature_coeffs.loc[
    LASSO_feature_coeffs["feature"].isin(numeric_feature_names), :
].copy()
LASSO_numeric_feature_coeffs["explanation"] = [
    X_header[X_train.columns.get_loc(feature)]
    for feature in LASSO_numeric_feature_coeffs["feature"]
]
# Five most positive coefficients: table display, then printed descriptions.
LASSO_numeric_feature_coeffs.sort_values("coeffs", ascending=False).head(5).drop("feature", axis=1)
for exp in LASSO_numeric_feature_coeffs.sort_values("coeffs", ascending=False).head(5).explanation:
    print(exp)
As shown in the table and list above, important numeric factors that indicate strong transit use include:
(1) A large immigrant population who moved to Canada when they were 25 to 44 years old,
(2) High labor participation rate (number of people working as a percentage of total population),
(3) High number of people who do not live in families,
(4) High number of people of British ethnicity, and
(5) High number of people who spend less money on housing as compared to their income.
# Five most negative numeric coefficients (strongest predictors of LOW
# transit use): table display, then printed descriptions.
LASSO_numeric_feature_coeffs.sort_values("coeffs").head(5).drop("feature", axis=1)
for exp in LASSO_numeric_feature_coeffs.sort_values("coeffs").head(5).explanation:
    print(exp)
As shown in the table and list above, the following numeric features most strongly predict low transit use:
(1) High employment rate (number of people actually employed as a percentage of people who want to be employed),
(2) Large self-employed population,
(3) Large number of people working in the same CSD,
(4) High average value of dwellings, and
(5) High average housing cost for rented dwellings.
We would also be interested in knowing where the two variables that we are interested in rank among the numeric features. The answer is given below:
# Where do our two transit-access variables rank among the numeric features
# by LASSO coefficient (sorted most-positive first)?
_LASSO_numeric_ranking = list(
    LASSO_numeric_feature_coeffs.sort_values("coeffs", ascending=False)["feature"]
)
# BUG FIX: list.index() is 0-based, so the top feature was reported as
# "ranks 0"; add 1 so ranks are 1-based as the printed sentence implies.
rank_services_LASSO = _LASSO_numeric_ranking.index("NBA_services_PC") + 1
rank_stops_LASSO = _LASSO_numeric_ranking.index("NBA_stops_PC") + 1
print(f"Among {LASSO_numeric_feature_coeffs.shape[0]} numeric features, number of services percapita ranks {rank_services_LASSO}, coefficient is {list(LASSO_numeric_feature_coeffs.loc[LASSO_numeric_feature_coeffs['feature'] == 'NBA_services_PC', 'coeffs'])[0]:.4f}. Number of stops per capite ranks {rank_stops_LASSO}, and coefficient is {list(LASSO_numeric_feature_coeffs.loc[LASSO_numeric_feature_coeffs['feature'] == 'NBA_stops_PC', 'coeffs'])[0]:.4f}.")
#### Numeric features
# Numeric-feature impurity importances from the random forest, annotated
# with their census descriptions.
rf_numeric_feature_imps = rf_immpurity_feat_imp_coeffs.loc[
    rf_immpurity_feat_imp_coeffs["feature"].isin(numeric_feature_names), :
].copy()
rf_numeric_feature_imps["explanation"] = [
    X_header[X_train.columns.get_loc(feature)]
    for feature in rf_numeric_feature_imps["feature"]
]
# Top five by importance: table display, then printed descriptions.
rf_numeric_feature_imps.sort_values("impurity_importance", ascending=False).head(5).drop("feature", axis=1)
for exp in rf_numeric_feature_imps.sort_values("impurity_importance", ascending=False).head(5).explanation:
    print(exp)
The table and list above show the important numeric features in our Random Forest model. Population density turns out to be the most important feature. Also, much to our pleasure, number of services per capita, the variable that we are interested in, ranks the second in terms of importance.
Other important features include number of government transfer recipients, high number of single-detached house, and very old house dwellings.
#### LASSO Model
# Proportion features are named "<numerator>_..._<denominator>"; annotate
# each with the census descriptions of both parts.
LASSO_proportion_feature_coeffs = LASSO_feature_coeffs.loc[
    LASSO_feature_coeffs["feature"].isin(proportion_feature_names), :
].copy()
LASSO_proportion_feature_coeffs["denominator"] = [
    X_header[X_train.columns.get_loc(feature.split("_")[-1])]
    for feature in LASSO_proportion_feature_coeffs["feature"]
]
LASSO_proportion_feature_coeffs["numerator"] = [
    X_header[X_train.columns.get_loc(feature.split("_")[0])]
    for feature in LASSO_proportion_feature_coeffs["feature"]
]
# Five most positive coefficients: table display, then printed descriptions.
LASSO_proportion_feature_coeffs.sort_values("coeffs", ascending=False).head(5).drop("feature", axis=1)
for denominator_exp, numerator_exp in zip(
    LASSO_proportion_feature_coeffs.sort_values("coeffs", ascending=False).head(5).denominator,
    LASSO_proportion_feature_coeffs.sort_values("coeffs", ascending=False).head(5).numerator,
):
    print(f"Denominator is {denominator_exp};")
    print(f"Numerator is {numerator_exp}.")
    print("")
As shown in the table and list above, strong predictors for high transit use, among proportion variables, include:
(1) High proportion of people working in sales and service industries,
(2) High proportion of people working at a constant and usual location,
(3) High proportion of people with East and Southeast Asian ethnic origins,
(4) High proportion of people commuting to a different CSD but different CD for work,
(5) High proportion of households that have couples with children.
# Five most negative coefficients among the proportion features.
LASSO_proportion_feature_coeffs.sort_values("coeffs").head(5).drop("feature", axis=1)
for denominator_exp, numerator_exp in zip(
    LASSO_proportion_feature_coeffs.sort_values("coeffs").head(5).denominator,
    LASSO_proportion_feature_coeffs.sort_values("coeffs").head(5).numerator,
):
    print(f"Denominator is {denominator_exp};")
    print(f"Numerator is {numerator_exp}.")
    print()
By contrast, strong predictors for low transit use, among proportion variables, include:
(1) High proportion of people who live in a dwelling which they own,
(2) High proportion of households where three or more generations live together,
(3) High number of immigrants who are born in India, as a proportion of all immigrants born in Asia,
(4) High proportion of tenant households spending 30% or more of their income on shelter costs, and
(5) High number of internal migrants as a proportion of all migrants.
#### Proportion features
# Proportion-feature impurity importances, annotated with the census
# descriptions of the numerator and denominator columns.
rf_proportion_feature_imps = rf_immpurity_feat_imp_coeffs.loc[
    rf_immpurity_feat_imp_coeffs["feature"].isin(proportion_feature_names), :
].copy()
rf_proportion_feature_imps["denominator"] = [
    X_header[X_train.columns.get_loc(feature.split("_")[-1])]
    for feature in rf_proportion_feature_imps["feature"]
]
rf_proportion_feature_imps["numerator"] = [
    X_header[X_train.columns.get_loc(feature.split("_")[0])]
    for feature in rf_proportion_feature_imps["feature"]
]
# Top five by importance: table display, then printed descriptions.
rf_proportion_feature_imps.sort_values("impurity_importance", ascending=False).head(5).drop("feature", axis=1)
for denominator_exp, numerator_exp in zip(
    rf_proportion_feature_imps.sort_values("impurity_importance", ascending=False).head(5).denominator,
    rf_proportion_feature_imps.sort_values("impurity_importance", ascending=False).head(5).numerator,
):
    print(f"Denominator is {denominator_exp};")
    print(f"Numerator is {numerator_exp}.")
    print()
In our random forest model, marital status of residents have the most impact on use of public transit. Other important proportion features include:
(1) Number of immigrants from East and Southeast Asia as a proportion of all immigrants,
(2) Proportion of residents speaking more than two languages at home, with the second language not being an official language.
(3) Proportion of residents who only speak one language at home.
Here, we use the SHapley Additive exPlanations (SHAP) tool to further understand the effects of features in our Random Forest model.
# Read shap_values
# Precomputed SHAP values for the random-forest model, saved offline as a
# NumPy array; used below alongside preprocessed_data.
with open(
    os.path.join(
        "Models",
        data_version,
        "shap_values.npy",
    ),
    'rb'
) as f:
    shap_values = np.load(f)
The variable that we are most interested in, of course, is the number of services per capita in the neighborhood area, NBA_services_PC. Two takeaways are worth mentioning here:
(1) The relationship between NBA_services_PC and transit use is positive, with the only exception of very large NBA_services_PC values.
(2) Among other variables, vn34_ultimate_vn33 (proportion of married or common-law couples among all residents) has the highest frequency of interaction with NBA_services_PC. The positive relationship between NBA_services_PC and transit use seems to be more positive for areas with high proportions of married people.
# Dependence plot for services per capita: SHAP value vs feature value,
# colored by the feature SHAP picks as the strongest interaction.
shap.dependence_plot("NBA_services_PC", shap_values, preprocessed_data)
# NOTE(review): dependence_plot draws (and may show) its own figure; confirm
# this savefig captures the plot rather than an empty canvas.
plt.savefig(
    os.path.join(
        "Figures",
        data_version,
        "shap_NBA_services.png",
    )
)
# Display the census descriptions of the interacting columns (vn34 / vn33).
X_header[X_train.columns.get_loc("vn34")]
X_header[X_train.columns.get_loc("vn33")]
Another variable that we are interested in is NBA_stops_PC, the number of stops in the neighborhood area. As we can see in the following plot, it also has a positive, albeit weaker, relationship with transit use. Also interestingly, there seems to be a strong case of interaction between NBA_stops_PC and vn142_ultimate_vn131, which is the proportion of people who speak a non-official language as their mother tongue. For DAs with more presence of such population, the relationship between transit stops and public transportation use seems stronger.
# Dependence plot for stops per capita (displayed only, not saved).
shap.dependence_plot("NBA_stops_PC", shap_values, preprocessed_data)
# NOTE(review): the narrative above refers to vn142_ultimate_vn131, but vn207
# is looked up here -- confirm which column was intended.
X_header[X_train.columns.get_loc("vn207")]
X_header[X_train.columns.get_loc("vn131")]
To gain an understanding of key features that impact transit use at the DA level, I have made the charts below.
# Global summary: per-feature SHAP value distributions across all DAs.
shap.summary_plot(shap_values, preprocessed_data)
We have not yet figured out where the GVA's public transportation agency should prioritize in terms of service and infrastructure development. However, the analyses above provide valuable information about the relationship between demographic characteristics and transit use in the GVA, especially the role of access to public transportation services and stops.
Geographically speaking, areas in and around the downtown core tend have higher public transit use, something hardly surprising given our prior analyses on the spatial distribution of transit services in the area. However, variations within the downtown core should not be overlooked.
In addition to geographical location, factors that impact transit use are diverse, distributed across the variables covered by the census. DA level variables in ethnicity, family structure, housing, immigration, labor and language all make to the top of importance.
Access to transit does seem to affect people's transit use, with all the other variables above taken into consideration. Both of our models agree that the number of services per capita in a DA's neighborhood area is a more important feature than the number of stops per capita. Our LASSO model does not even select the number of stops per capita as a variable. This seems to suggest that the authority should pour more of their resources into strengthening transit services at existing locations, instead of setting up new stops.
Exactly how important are transit services and stops? In fact, the two models that we select diverge to a certain degree. What we can be confident about for now, however, is that number of transit services per capita is among the 5% most important numeric features, and is related to transit use positively.
If the public transportation authority of the GVA does indeed increase service in the key areas, how much increase in transit use can we expect? A more fundamental question is: although an increase in access to public transit does have a positive impact on use of transit in commuting, is it worth it for the GVA's public transportation agency to invest in increasing transit services? Given the economic law of diminishing returns, there will be a point beyond which it is no longer wise to increase transit services. Where is this critical point?
This is an important, but difficult, question to ask, and the answer provided here is imperfect. However, it at least gives a sense of the power of machine learning methods in public transit planning. I will bring back the whole dataset and identify areas that GVA's transportation agency should pour its resources to. I will identify such areas in two ways:
(1) I will create a hypothetical dataset (X_1) where each DA's NBA_services_PC increases by a fixed amount (10% of average current NBA_services_PC across all DAs in GVA, or about 1.2). I will then identify DAs where transit use rate increases the most (predicted transit use with increased service per capita, as compared to predicted transit use in the current scenario), measured by percentage point increase.
(2) I will create a hypothetical dataset (X_2) where each DA's NBA_services_PC increases by a fixed percentage (10%). I will then identify DAs where transit use rate increases the most as a percentage of the current transit use rate (predicted transit use with increased service per capita, as compared to predicted transit use in the current scenario), measured by percentage increase.
Although we have built two models above, in both scenarios, the LASSO model will not be helpful. This is because LASSO is essentially a linear model, and thus change in output is linearly dependent on change in input, without interaction. In other words, without experimentation, we know that in scenario (1), transit use will increase by the same amount across all DAs, whereas in scenario (2), transit use will increase the most where it is already high.
Therefore, in analyses below, only Random Forest model will be used.
### Read in whole dataset
# Full GVA dataset (all DAs, not just the training split) for the
# intervention scenarios below; prop_public is the target column.
df_full = geopandas.read_file(
    os.path.join("Data_Tables", data_version, "GVA_DA_Modeling.json")
)
y = df_full["prop_public"]
X = df_full.drop(["prop_public"], axis=1)
#### Get predictions on whole dataset
# Baseline (status quo) random-forest predictions for every DA.
X_preprocessed = preprocessor.transform(X)
X_pred_rf = random_search_rf.best_estimator_["rf_reg"].predict(X_preprocessed)
#### Make hypothetical datasets
# Scenario 1: every DA's services-per-capita increases by a fixed absolute
# amount (10% of the current GVA-wide mean).
X_1 = X.copy()
delta = 0.1 * X_1.loc[:, "NBA_services_PC"].mean()
X_1.loc[:, "NBA_services_PC"] = X_1.loc[:, "NBA_services_PC"] + delta
X_1_preprocessed = preprocessor.transform(X_1)
# Scenario 2: every DA's services-per-capita increases by a relative 10%.
X_2 = X.copy()
X_2.loc[:, "NBA_services_PC"] = X_2.loc[:, "NBA_services_PC"] * 1.1
X_2_preprocessed = preprocessor.transform(X_2)
In Scenario 1, DAs are identified where increase of transit service per capita by a fixed amount (10% of average current NBA_services_PC across all DAs in GVA), will result in the most (top top_pct_s1, as a percentage) increase in proportion of people using public transit. If such an increase does happen to these DAs, how much change to transit use will happen exactly? For the sake of demonstration, we set top_pct_s1 to 10%.
# Fraction of DAs to flag as "top" by predicted transit-use increase.
top_pct_s1 = 0.1
### X_1
# Predict under scenario 1 and flag DAs above the (1 - top_pct_s1) quantile
# of predicted increase.
X_1_pred_rf = random_search_rf.best_estimator_["rf_reg"].predict(X_1_preprocessed)
X_1["rf_pp_new"] = X_1_pred_rf
X_1["rf_pp_increase"] = X_1_pred_rf - X_pred_rf
X_1["rf_pp_increase_top"] = X_1["rf_pp_increase"] > X_1["rf_pp_increase"].quantile(1 - top_pct_s1)
# Map flagged vs non-flagged DAs over a basemap, fixed to the GVA extent.
X_1_rf_ax = X_1.plot(
    figsize=(20, 20),
    alpha=0.5,
    column="rf_pp_increase_top",
    legend=True,
    categorical=True,
    cmap="Paired",
)
X_1_rf_ax.set_xlim(GVA_map_xlim_lower, GVA_map_xlim_higher)
X_1_rf_ax.set_ylim(GVA_map_ylim_lower, GVA_map_ylim_higher)
ctx.add_basemap(X_1_rf_ax, zoom=12)
plt.title(f"DAs with Top {top_pct_s1:.1%} Transit Use Increase: Random Forest Model, 1.2 Increase in Service per Capita", fontsize=20)
plt.savefig(
    os.path.join(
        "Maps",
        data_version,
        "X_1_rf.png",
    )
)
# Current average transit share and mean predicted increase among flagged
# DAs (bare expressions display their values in the notebook).
y[X_1["rf_pp_increase_top"] == True].mean()
X_1.loc[X_1["rf_pp_increase_top"] == True, "rf_pp_increase"].mean()
In scenario 1, the identified DAs will see about a 0.51 percentage point increase in the proportion of transit users. Given that, on average, these DAs have about 12.2% of commuters using public transit, this does not seem to be a really great increase. Maybe we should narrow the scope of DAs to pour resources into? Indeed, we can adjust the percentage of DAs to pour resources into, which currently stands at 10%. The following chart summarizes this.
# Sweep the "top" cutoff from 0.5% to 19.5% and record, for each cutoff:
# the current transit share, the mean predicted percentage-point increase,
# and the total population (vn13) of the selected DAs.
top_pcts_s1 = np.arange(0.005, 0.2, 0.005)
X_1_eva_results = {"top_pct_s1": [], "current_percentage": [], "percentage_point_increase":[], "total_population":[]}
for top_pct_s1 in top_pcts_s1:
    mask = X_1["rf_pp_increase"] > X_1["rf_pp_increase"].quantile(1 - top_pct_s1)
    X_1_eva_results["top_pct_s1"].append(top_pct_s1)
    X_1_eva_results["current_percentage"].append(y[mask == True].mean())
    X_1_eva_results["percentage_point_increase"].append(X_1.loc[mask == True, "rf_pp_increase"].mean()*100)
    X_1_eva_results["total_population"].append(X_1.loc[mask == True, "vn13"].sum())
X_1_eva_results = pd.DataFrame(X_1_eva_results)
# Relative increase: percentage-point increase as a fraction of the current
# transit share.
X_1_eva_results["percent_increase"] = X_1_eva_results["percentage_point_increase"]/ 100 / X_1_eva_results["current_percentage"]
# Three diagnostic line charts (displayed in the notebook, not saved):
# current share, absolute increase, and relative increase vs population.
alt.Chart(X_1_eva_results).mark_line().encode(
    x=alt.X("total_population", title="Total population of DAs selected"),
    y=alt.Y("current_percentage", title="Average current commute using percentage", axis=alt.Axis(format='%')),
).properties(
    title='Impacts of 1.2 increase in number of services per capita'
)
alt.Chart(X_1_eva_results).mark_line().encode(
    x=alt.X("total_population", title="Total population of DAs selected"),
    y=alt.Y("percentage_point_increase", title="Percentage point increase in transit use"),
).properties(
    title='Impacts of 1.2 increase in number of services per capita'
)
alt.Chart(X_1_eva_results).mark_line().encode(
    x=alt.X("total_population", title="Total population of DAs selected"),
    y=alt.Y("percent_increase", title="Increase as percent of current transit use", axis=alt.Axis(format='%')),
).properties(
    title='Impacts of 1.2 increase in number of services per capita'
)
In Scenario 2, DAs are identified where increase of transit service per capita by a fixed percent (10%) will result in the most (top 10%) increase in proportion of people using public transit.
# Fraction of DAs to flag as "top" in scenario 2.
top_pct_s2 = 0.1
### X_2
# Predict under scenario 2 (10% relative service increase) and flag DAs
# above the (1 - top_pct_s2) quantile of predicted increase.
X_2_pred_rf = random_search_rf.best_estimator_["rf_reg"].predict(X_2_preprocessed)
X_2["rf_pp_new"] = X_2_pred_rf
X_2["rf_pp_increase"] = X_2_pred_rf - X_pred_rf
X_2["rf_pp_increase_top"] = X_2["rf_pp_increase"] > X_2["rf_pp_increase"].quantile(1 - top_pct_s2)
X_2_rf_ax = X_2.plot(
    figsize=(20, 20),
    alpha=0.5,
    column="rf_pp_increase_top",
    legend=True,
    categorical=True,
    cmap="Paired",
)
X_2_rf_ax.set_xlim(GVA_map_xlim_lower, GVA_map_xlim_higher)
X_2_rf_ax.set_ylim(GVA_map_ylim_lower, GVA_map_ylim_higher)
ctx.add_basemap(X_2_rf_ax, zoom=12)
# NOTE(review): top_pct_s2 (the selection fraction) is interpolated as the
# service increase here; the printed value happens to be right because both
# are 10%, but the variable roles look swapped relative to the X_1 title.
plt.title(f"DAs with Top 10% Transit Use Increase: Random Forest Model, {top_pct_s2:.1%} Increase in Service per Capita", fontsize=20)
plt.savefig(
    os.path.join(
        "Maps",
        data_version,
        "X_2_rf.png",
    )
)
# Current average transit share and mean predicted increase among flagged DAs.
y[X_2["rf_pp_increase_top"] == True].mean()
X_2.loc[X_2["rf_pp_increase_top"] == True, "rf_pp_increase"].mean()
The results that we see from the second scenario are indeed quite similar to the first one. Under the Random Forest model, we expect to see about a 0.35 percentage point increase in the proportion of transit use in the identified DAs.
# Sweep the scenario-2 cutoff over the same grid and record the same
# metrics as for scenario 1.
top_pcts_s2 = np.arange(0.005, 0.2, 0.005)
X_2_eva_results = {"top_pct_s2": [], "current_percentage": [], "percentage_point_increase":[], "total_population":[]}
for top_pct_s2 in top_pcts_s2:
    mask = X_2["rf_pp_increase"] > X_2["rf_pp_increase"].quantile(1 - top_pct_s2)
    X_2_eva_results["top_pct_s2"].append(top_pct_s2)
    X_2_eva_results["current_percentage"].append(y[mask == True].mean())
    X_2_eva_results["percentage_point_increase"].append(X_2.loc[mask == True, "rf_pp_increase"].mean()*100)
    X_2_eva_results["total_population"].append(X_2.loc[mask == True, "vn13"].sum())
X_2_eva_results = pd.DataFrame(X_2_eva_results)
X_2_eva_results["percent_increase"] = X_2_eva_results["percentage_point_increase"]/ 100 / X_2_eva_results["current_percentage"]
# Diagnostic charts (displayed, not saved): current share, absolute
# increase, and relative increase vs total selected population.
alt.Chart(X_2_eva_results).mark_line().encode(
    x=alt.X("total_population", title="Total population of DAs selected"),
    y=alt.Y("current_percentage", title="Average current commute using percentage", axis=alt.Axis(format='%')),
).properties(
    title='Impacts of 10% increase in number of services per capita'
)
alt.Chart(X_2_eva_results).mark_line().encode(
    x=alt.X("total_population", title="Total population of DAs selected"),
    y=alt.Y("percentage_point_increase", title="Percentage point increase in transit use"),
).properties(
    title='Impacts of 10% increase in number of services per capita'
)
alt.Chart(X_2_eva_results).mark_line().encode(
    x=alt.X("total_population", title="Total population of DAs selected"),
    y=alt.Y("percent_increase", title="Increase as percent of current transit use", axis=alt.Axis(format='%')),
).properties(
    title='Impacts of 10% increase in number of services per capita'
)
Another perspective from which to look at our model is this: with key DAs identified, how much better an outcome would we have compared to spending the same amount of resources on randomly chosen DAs? A simulation is conducted below.
def find_random_mask_1(population):
    """Randomly select DAs from X_1 until their combined population exceeds a target.

    Used as the random baseline against the model-driven selection in scenario 1.

    Parameters
    ----------
    population : numeric
        Target total population. Selection stops as soon as the summed
        "vn13" (population) of the selected DAs exceeds this value.

    Returns
    -------
    pandas.Series of bool
        Boolean mask over X_1's rows, True for the randomly selected DAs.
    """
    X_1_random = X_1.copy()
    X_1_random["rand_selected"] = False
    n_rows = X_1.shape[0]
    # Fix: if the target meets or exceeds the total population, the running
    # sum could never surpass it and the original loop never terminated.
    total_pop = X_1["vn13"].sum()
    pop_sum = 0
    while pop_sum <= population:
        if pop_sum >= total_pop:
            break  # every DA is already selected; the target is unreachable
        rand_ind = np.random.randint(low=0, high=n_rows)
        # NOTE(review): .loc assumes X_1 carries a default 0..n-1 integer
        # index (as the original code did) -- confirm upstream.
        if not X_1_random.loc[rand_ind, "rand_selected"]:
            X_1_random.loc[rand_ind, "rand_selected"] = True
            # Incremental sum over *newly* selected rows replaces the
            # original O(n) recomputation on every draw; duplicates are
            # skipped, so the total matches the original unique-row sum.
            pop_sum += X_1_random.loc[rand_ind, "vn13"]
    return X_1_random["rand_selected"]
# Random baseline for scenario 1: for each population total produced by the
# model-driven selection, draw a random set of DAs with (roughly) the same
# combined population and measure the same outcome statistics.
baseline_current_pcts = []
baseline_pp_increases = []
for pop in X_1_eva_results["total_population"]:
    rand_mask = find_random_mask_1(pop)
    baseline_current_pcts.append(y[rand_mask].mean())
    baseline_pp_increases.append(X_1.loc[rand_mask, "rf_pp_increase"].mean() * 100)
rand_X_1_eva_results = pd.DataFrame({
    "current_percentage": baseline_current_pcts,
    "percentage_point_increase": baseline_pp_increases,
    "total_population": X_1_eva_results["total_population"],
})
rand_X_1_eva_results["percent_increase"] = (
    rand_X_1_eva_results["percentage_point_increase"] / 100 / rand_X_1_eva_results["current_percentage"]
)
rand_X_1_eva_results["top_pct_s1"] = X_1_eva_results["top_pct_s1"]
# Label the two policies and align columns so the frames stack cleanly.
X_1_eva_results["policy"] = "Selected by model"
rand_X_1_eva_results["policy"] = "Random baseline"
rand_X_1_eva_results = rand_X_1_eva_results[list(X_1_eva_results.columns)]
X_1_charts_data = pd.concat([X_1_eva_results, rand_X_1_eva_results], ignore_index=True)
# Absolute head-count gain implied by the percentage-point increase.
X_1_charts_data["benefited_population"] = (
    X_1_charts_data["percentage_point_increase"] * X_1_charts_data["total_population"] / 100
)
# Model vs. random baseline: total head-count gain in transit users (scenario 1).
X_1_benefited_population_compare = alt.Chart(X_1_charts_data).mark_line().encode(
    x=alt.X("total_population", title="Total population of DAs selected"),
    color=alt.Color("policy"),
    y=alt.Y("benefited_population", title="Total increase in population using transit"),
).properties(
    title="Impacts of 1.2 increase in number of services per capita: model vs. randomly selected DAs"
)
out_path = os.path.join("Figures", data_version, "X_1_benefited_population_compare.png")
X_1_benefited_population_compare.save(out_path)
X_1_benefited_population_compare
# Same comparison re-labelled for a blog-post audience ("neighborhoods"
# instead of "DAs"); saved but not displayed here.
X_1_benefited_population_compare_for_post = alt.Chart(X_1_charts_data).mark_line().encode(
    x=alt.X("total_population", title="Total population of neighborhoods selected"),
    color=alt.Color("policy"),
    y=alt.Y("benefited_population", title="Total increase in population using transit"),
).properties(
    title="Impacts of 1.2 increase in number of services per capita: model vs. randomly selected neighborhoods"
)
out_path = os.path.join("Figures", data_version, "X_1_benefited_population_compare_for_post.png")
X_1_benefited_population_compare_for_post.save(out_path)
# Model vs. random baseline: average *current* transit-use share of the
# selected DAs (scenario 1). Variable name keeps the original spelling
# because later cells may reference it.
X_1_current_precentage_compare = alt.Chart(X_1_charts_data).mark_line().encode(
    x=alt.X("total_population", title="Total population of DAs selected"),
    color=alt.Color("policy"),
    y=alt.Y(
        "current_percentage",
        title="Average current commute using percentage",
        axis=alt.Axis(format="%"),
    ),
).properties(
    title="Impacts of 1.2 increase in number of services per capita: model vs. randomly selected DAs"
)
out_path = os.path.join("Figures", data_version, "X_1_current_percentage_compare.png")
X_1_current_precentage_compare.save(out_path)
X_1_current_precentage_compare
# Model vs. random baseline: absolute percentage-point gain (scenario 1).
X_1_percentage_point_increase_compare = alt.Chart(X_1_charts_data).mark_line().encode(
    x=alt.X("total_population", title="Total population of DAs selected"),
    color=alt.Color("policy"),
    y=alt.Y("percentage_point_increase", title="Percentage point increase in transit use"),
).properties(
    title="Impacts of 1.2 increase in number of services per capita: model vs. randomly selected DAs"
)
out_path = os.path.join("Figures", data_version, "X_1_percentage_point_increase_compare.png")
X_1_percentage_point_increase_compare.save(out_path)
X_1_percentage_point_increase_compare
# Model vs. random baseline: gain relative to current transit use (scenario 1).
X_1_percent_increase_compare = alt.Chart(X_1_charts_data).mark_line().encode(
    x=alt.X("total_population", title="Total population of DAs selected"),
    color=alt.Color("policy"),
    y=alt.Y(
        "percent_increase",
        title="Increase as percent of current transit use",
        axis=alt.Axis(format="%"),
    ),
).properties(
    title="Impacts of 1.2 increase in number of services per capita: model vs. randomly selected DAs"
)
out_path = os.path.join("Figures", data_version, "X_1_percent_increase_compare.png")
X_1_percent_increase_compare.save(out_path)
X_1_percent_increase_compare
Here, a baseline case is used to compare with our scenario 1. We randomly select DAs whose populations, when added up, equal the total population of the DAs selected in scenario 1 for each specified percentage of DAs selected. Three things stand out:
Firstly, the DAs identified — where increasing transit service per capita by 1.2 results in the largest increase in the proportion of people using public transit — tend to have lower current levels of transit use. On average, about 20% of residents in the GVA use public transit to commute. However, our model suggests that with limited resources, we should first focus on DAs where only about 10%–12% of residents use public transit.
Secondly, our model seems promising in selecting the DAs where the percentage-point increase in transit use is the largest. If DAs are randomly selected, a 1.2 increase in the number of services per capita results in about a 0.1 percentage point increase in the proportion of transit use. By contrast, even if we target DAs with 300,000 population in total, they on average see a 0.45 percentage point increase in the proportion of transit use.
Similarly, the selected DAs will also see a large increase in transit use, as measured by the increase as a percentage of current transit use. For example, if we target DAs with 250,000 population in total, their proportion of transit use will increase by about 4% on average. By contrast, for randomly selected DAs, if their number of services per capita increases by 1.2, they will see less than a 1% increase in transit use.
def find_random_mask_2(population):
    """Randomly select DAs from X_2 until their combined population exceeds a target.

    Used as the random baseline against the model-driven selection in scenario 2
    (mirrors find_random_mask_1, but samples from X_2).

    Parameters
    ----------
    population : numeric
        Target total population. Selection stops as soon as the summed
        "vn13" (population) of the selected DAs exceeds this value.

    Returns
    -------
    pandas.Series of bool
        Boolean mask over X_2's rows, True for the randomly selected DAs.
    """
    X_2_random = X_2.copy()
    X_2_random["rand_selected"] = False
    n_rows = X_2.shape[0]
    # Fix: if the target meets or exceeds the total population, the running
    # sum could never surpass it and the original loop never terminated.
    total_pop = X_2["vn13"].sum()
    pop_sum = 0
    while pop_sum <= population:
        if pop_sum >= total_pop:
            break  # every DA is already selected; the target is unreachable
        rand_ind = np.random.randint(low=0, high=n_rows)
        # NOTE(review): .loc assumes X_2 carries a default 0..n-1 integer
        # index (as the original code did) -- confirm upstream.
        if not X_2_random.loc[rand_ind, "rand_selected"]:
            X_2_random.loc[rand_ind, "rand_selected"] = True
            # Incremental sum over *newly* selected rows replaces the
            # original O(n) recomputation on every draw; duplicates are
            # skipped, so the total matches the original unique-row sum.
            pop_sum += X_2_random.loc[rand_ind, "vn13"]
    return X_2_random["rand_selected"]
# Random baseline for scenario 2, mirroring the scenario-1 comparison.
# BUG FIX: this loop previously called find_random_mask_1 (scenario 1's
# sampler over X_1), leaving find_random_mask_2 dead code; the baseline
# masks must be drawn from X_2.
rand_X_2_eva_results = {"current_percentage": [], "percentage_point_increase": [], "total_population": X_2_eva_results["total_population"]}
for pop in X_2_eva_results["total_population"]:
    mask = find_random_mask_2(pop)
    rand_X_2_eva_results["current_percentage"].append(y[mask == True].mean())
    rand_X_2_eva_results["percentage_point_increase"].append(X_2.loc[mask == True, "rf_pp_increase"].mean() * 100)
rand_X_2_eva_results = pd.DataFrame(rand_X_2_eva_results)
# Gain relative to the current level of transit use.
rand_X_2_eva_results["percent_increase"] = rand_X_2_eva_results["percentage_point_increase"] / 100 / rand_X_2_eva_results["current_percentage"]
rand_X_2_eva_results["top_pct_s2"] = X_2_eva_results["top_pct_s2"]
# Label the two policies and align columns so the frames stack cleanly.
X_2_eva_results["policy"] = "Selected by model"
rand_X_2_eva_results["policy"] = "Random baseline"
rand_X_2_eva_results = rand_X_2_eva_results[list(X_2_eva_results.columns)]
X_2_charts_data = pd.concat([X_2_eva_results, rand_X_2_eva_results], ignore_index=True)
# Absolute head-count gain implied by the percentage-point increase.
X_2_charts_data["benefited_population"] = X_2_charts_data["percentage_point_increase"] * X_2_charts_data["total_population"] / 100
# Model vs. random baseline: total head-count gain in transit users (scenario 2).
X_2_benefited_population_compare = alt.Chart(X_2_charts_data).mark_line().encode(
    x=alt.X("total_population", title="Total population of DAs selected"),
    color=alt.Color("policy"),
    y=alt.Y("benefited_population", title="Total increase in population using transit"),
).properties(
    title="Impacts of 10% increase in number of services per capita: model vs. randomly selected DAs"
)
out_path = os.path.join("Figures", data_version, "X_2_benefited_population_compare.png")
X_2_benefited_population_compare.save(out_path)
X_2_benefited_population_compare
# Same comparison re-labelled for a blog-post audience ("neighborhoods"
# instead of "DAs"); saved but not displayed here.
X_2_benefited_population_compare_for_post = alt.Chart(X_2_charts_data).mark_line().encode(
    x=alt.X("total_population", title="Total population of neighborhoods selected"),
    color=alt.Color("policy"),
    y=alt.Y("benefited_population", title="Total increase in population using transit"),
).properties(
    title="Impacts of 10% increase in number of services per capita: model vs. randomly selected neighborhoods"
)
out_path = os.path.join("Figures", data_version, "X_2_benefited_population_compare_for_post.png")
X_2_benefited_population_compare_for_post.save(out_path)
# Model vs. random baseline: average *current* transit-use share of the
# selected DAs (scenario 2). Variable name keeps the original spelling
# because later cells may reference it.
X_2_current_precentage_compare = alt.Chart(X_2_charts_data).mark_line().encode(
    x=alt.X("total_population", title="Total population of DAs selected"),
    color=alt.Color("policy"),
    y=alt.Y(
        "current_percentage",
        title="Average current commute using percentage",
        axis=alt.Axis(format="%"),
    ),
).properties(
    title="Impacts of 10% increase in number of services per capita: model vs. randomly selected DAs"
)
out_path = os.path.join("Figures", data_version, "X_2_current_percentage_compare.png")
X_2_current_precentage_compare.save(out_path)
X_2_current_precentage_compare
# Model vs. random baseline: absolute percentage-point gain (scenario 2).
X_2_percentage_point_increase_compare = alt.Chart(X_2_charts_data).mark_line().encode(
    x=alt.X("total_population", title="Total population of DAs selected"),
    color=alt.Color("policy"),
    y=alt.Y("percentage_point_increase", title="Percentage point increase in transit use"),
).properties(
    title="Impacts of 10% increase in number of services per capita: model vs. randomly selected DAs"
)
out_path = os.path.join("Figures", data_version, "X_2_percentage_point_increase_compare.png")
X_2_percentage_point_increase_compare.save(out_path)
X_2_percentage_point_increase_compare
# Model vs. random baseline: gain relative to current transit use (scenario 2).
X_2_percent_increase_compare = alt.Chart(X_2_charts_data).mark_line().encode(
    x=alt.X("total_population", title="Total population of DAs selected"),
    color=alt.Color("policy"),
    y=alt.Y(
        "percent_increase",
        title="Increase as percent of current transit use",
        axis=alt.Axis(format="%"),
    ),
).properties(
    title="Impacts of 10% increase in number of services per capita: model vs. randomly selected DAs"
)
out_path = os.path.join("Figures", data_version, "X_2_percent_increase_compare.png")
X_2_percent_increase_compare.save(out_path)
X_2_percent_increase_compare
As shown in the charts above, the results in scenario 2 are similar to those in scenario 1.
These results should be taken with a grain of salt.
It is unrealistic to expect that some infrastructure development will only increase service in the selected DAs while keeping service in other DAs constant. This point has two implications. On the one hand, even if the transit authority focuses solely on increasing service in the identified areas, other areas across the GVA will benefit. On the other hand, an "increasing service by 10% in 10% of all DAs" scenario does not mean that the system's variable cost will increase by 1% (10% * 10%). Most likely, the real cost increase will be significantly bigger.
In addition, our model only takes into account a limited number of factors, and there are certainly important variables that would help to build our predictive models but for which we do not have data. For example, we do not know whether public transportation has been in a DA long enough for people there to have developed a habit of using transit.
Lastly, there is the fundamental problem of applying inference models to prediction. Without randomized trials, which seem extremely expensive in our case, we cannot know for sure whether the seemingly strong relationship between access to transit service and transit use is causal. Therefore, when we pour resources into these areas, the outcome may not be as strong as what we might expect.